rmarkdown::render(‘./1_JP_scripts/1_QC_filtering_metrics_HARDAC.Rmd’)

Changes in myeloid and kidney cells after renal ischemia - Analysis of 2 x 10X scRNA-seq samples from 2 pools of WT (3 Sham + 3 CLP): comparison of gene expression in sham vs CLP in different cell populations

SECTION 1:

LOAD DATA

# Load the Cell Ranger Matrix Data and create the base Seurat
# object.  we initialize the Seurat object
# (`CreateSeuratObject`) with the raw (non-normalized data).

library(Seurat)
dataset_loc <- "./rawData/"
ids <- c("C0", "C24")

d10x.data <- sapply(ids, function(i) {
    d10x <- Read10X(file.path(dataset_loc, i, "filtered_feature_bc_matrix/"))
    colnames(d10x) <- paste(sapply(strsplit(colnames(d10x), split = "-"), 
        "[[", 1L), i, sep = "--")
    d10x
})

experiment.data <- do.call("cbind", d10x.data)

experiment.aggregate <- CreateSeuratObject(experiment.data, project = "ShamCLP-201008", 
    names.field = 2, names.delim = "\\-\\-")
experiment.aggregate
## An object of class Seurat 
## 32285 features across 25133 samples within 1 assay 
## Active assay: RNA (32285 features, 0 variable features)

SECTION 2:

ANNOTATE THE DATASET

Calc mitocondrial content

Calculate percent UMIs mapping to the mitochondrial genome per cell

experiment.aggregate[["percent.mito"]] <- PercentageFeatureSet(experiment.aggregate, 
    pattern = "^mt-")
summary(experiment.aggregate[["percent.mito"]])
##   percent.mito    
##  Min.   : 0.3163  
##  1st Qu.:16.7708  
##  Median :35.5812  
##  Mean   :41.0215  
##  3rd Qu.:59.7707  
##  Max.   :99.0097

Modify sample names

The original samples names (the names above in ids) can be found in the metadata slot, column orig.ident.

samplename = experiment.aggregate@meta.data$orig.ident
table(samplename)
## samplename
##    C0   C24 
## 14408 10725

Modify sample names to include phenotypic information

sample.id = rep("C0", length(samplename))
sample.id[samplename %in% c("C24")] = "C24"
names(sample.id) = rownames(experiment.aggregate@meta.data)

Another way of adding an annotation to the Seurat object

experiment.aggregate <- AddMetaData(object = experiment.aggregate, 
    metadata = sample.id, col.name = "sample.id")

table(sample.id)
## sample.id
##    C0   C24 
## 14408 10725

SECTION 3:

QC METRICS WITH SEURAT

outdir <- "./processedData/1_JP_analyses_results/Rerun_HARDAC_20210216/1_QC_filtering_metrics"
dir.create(outdir, recursive = T)
colors <- c("#12999E", "#FDA908")
names(colors) <- levels(as.factor(sample.id))
v <- VlnPlot(experiment.aggregate, features = c("nFeature_RNA", 
    "nCount_RNA", "percent.mito"), ncol = 3, cols = colors, group.by = "sample.id")
v

pdf(paste0(outdir, "/1_VlnPlots_nFeature_RNA_nCount_RNA_percent.mito_per_sample_before_filtering.pdf"), 
    width = 21, height = 9)
v
dev.off()
## png 
##   2
f <- FeatureScatter(experiment.aggregate, feature1 = "nCount_RNA", 
    feature2 = "percent.mito", cols = colors, group.by = "sample.id")
f

pdf(paste0(outdir, "/2_FeatureScatter_percent.mito_vs_nCount_RNA_per_sample_before_filtering.pdf"))
f
dev.off()
## png 
##   2
f <- FeatureScatter(experiment.aggregate, feature1 = "nCount_RNA", 
    feature2 = "nFeature_RNA", cols = colors, group.by = "sample.id")
f

pdf(paste0(outdir, "/3_FeatureScatter_nFeature_RNA_vs_nCount_RNA_per_sample_before_filtering.pdf"))
f
dev.off()
## png 
##   2

Library size

hist(experiment.aggregate$nCount_RNA, xlab = "Library sizes", 
    breaks = 100, col = "grey80", ylab = "Number of cells", main = "Distribution of library sizes
(= total number of transcripts per cell = $nCount_RNA)", 
    cex.main = 1)
abline(v = 2000, col = "red")

pdf(paste0(outdir, "/4_Library_size.pdf"), width = 8, height = 9)
hist(experiment.aggregate$nCount_RNA, xlab = "Library sizes", 
    breaks = 100, col = "grey80", ylab = "Number of cells", main = "Distribution of library sizes
(= total number of transcripts per cell = $nCount_RNA)", 
    cex.main = 1)
abline(v = 2000, col = "red")
dev.off()
## png 
##   2
saveRDS(experiment.aggregate, paste0(outdir, "/5.experiment.aggregate.158.rds"))

QC with scater

# experiment.aggregate <-
# readRDS('./processedData/1_QC_filtering_metrics/5.experiment.aggregate.158.rds'')
experiment.aggregate.sce <- as.SingleCellExperiment(experiment.aggregate)
experiment.aggregate.sce
## class: SingleCellExperiment 
## dim: 32285 25133 
## metadata(0):
## assays(2): counts logcounts
## rownames(32285): Xkr4 Gm1992 ... AC234645.1 AC149090.1
## rowData names(0):
## colnames(25133): AAACCCAAGAGGGTGG--C0 AAACCCAAGATGGCGT--C0 ...
##   TTTGTTGTCTAAACGC--C24 TTTGTTGTCTCGACCT--C24
## colData names(6): orig.ident nCount_RNA ... sample.id ident
## reducedDimNames(0):
## altExpNames(0):
library(scater)
per.cell <- perCellQCMetrics(experiment.aggregate.sce, subsets = list(Mito = grep("^mt-", 
    rownames(experiment.aggregate.sce))))
summary(per.cell$sum)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     500    2753    5354    7639   10682   63817
summary(experiment.aggregate$nCount_RNA)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     500    2753    5354    7639   10682   63817
summary(per.cell$detected)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      70     566    1192    1497    2200    7841
summary(per.cell$subsets_Mito_percent)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.3163 16.7708 35.5812 41.0215 59.7707 99.0097
colData(experiment.aggregate.sce) <- cbind(colData(experiment.aggregate.sce), 
    per.cell)
p <- plotColData(experiment.aggregate.sce, x = "sum", y = "detected", 
    colour_by = "sample.id")
p

pdf(paste0(outdir, "/6_plotColData_detected_vs_sum.pdf"), width = 9, 
    height = 8)
p
dev.off()
## png 
##   2
p <- plotColData(experiment.aggregate.sce, x = "sum", y = "subsets_Mito_percent", 
    other_fields = "sample.id") + facet_wrap(~sample.id)
p

pdf(paste0(outdir, "/7_plotColData_subsets_Mito_percent_vs_sum.pdf"), 
    width = 15, height = 9)
p
dev.off()
## png 
##   2
qc.stats <- quickPerCellQC(per.cell, percent_subsets = "subsets_Mito_percent")
colData(experiment.aggregate.sce) <- cbind(colData(experiment.aggregate.sce), 
    qc.stats)
colSums(as.matrix(qc.stats))
##              low_lib_size            low_n_features high_subsets_Mito_percent 
##                         0                         0                         0 
##                   discard 
##                         0
experiment.aggregate.sce <- runColDataPCA(experiment.aggregate.sce, 
    variables = c("sum", "detected", "subsets_Mito_percent"))
p <- plotReducedDim(experiment.aggregate.sce, dimred = "PCA_coldata", 
    colour_by = "discard", size_by = "subsets_Mito_percent", 
    percentVar = 100 * attr(reducedDim(experiment.aggregate.sce), 
        "percentVar")) + ggtitle("PCA based on QC metrics / Detected outlier cells are highlighted") + 
    theme(plot.title = element_text(size = 7), legend.title = element_text(size = 7), 
        legend.text = element_text(size = 7))
p

pdf(paste0(outdir, "/8_plotReducedDim_QC_metrics.pdf"), width = 9, 
    height = 9)
p
dev.off()
## png 
##   2

Manual cell filtering

Now we can define a cell filter based on our previous analysis:

filter_by_expr_features <- (experiment.aggregate.sce$detected >= 
    200 & experiment.aggregate.sce$detected <= 9000)
table(filter_by_expr_features)
## filter_by_expr_features
## FALSE  TRUE 
##  1388 23745
filter_by_total_counts <- (experiment.aggregate.sce$sum >= 500 & 
    experiment.aggregate.sce$sum < 150000)
table(filter_by_total_counts)
## filter_by_total_counts
##  TRUE 
## 25133
filter_by_MT <- experiment.aggregate.sce$subsets_Mito_percent <= 
    55
table(filter_by_MT)
## filter_by_MT
## FALSE  TRUE 
##  7067 18066
# filtered <- experiment.aggregate.sce[,!qc.stats$discard]
experiment.aggregate.sce$use <- (
    # sufficient features (genes)
    filter_by_expr_features &
    # sufficient molecules counted
    filter_by_total_counts &
    # remove cells with unusual number of reads in MT genes
    filter_by_MT
)
table(experiment.aggregate.sce$use)
## 
## FALSE  TRUE 
##  7078 18055
experiment.aggregate.sce$discard <- !experiment.aggregate.sce$use
table(experiment.aggregate.sce$discard)
## 
## FALSE  TRUE 
## 18055  7078
experiment.aggregate.sce <- runColDataPCA(experiment.aggregate.sce, 
    variables = c("sum", "detected", "subsets_Mito_percent"))
p <- plotReducedDim(experiment.aggregate.sce, dimred = "PCA_coldata", 
    colour_by = "discard", size_by = "subsets_Mito_percent", 
    percentVar = 100 * attr(reducedDim(experiment.aggregate.sce), 
        "percentVar")) + ggtitle("PCA based on QC metrics / Manual outlier cells are highlighted") + 
    theme(plot.title = element_text(size = 7), legend.title = element_text(size = 7), 
        legend.text = element_text(size = 7))
p

pdf(paste0(outdir, "/9_plotReducedDim_QC_metrics_manual_filtering.pdf"), 
    width = 9, height = 9)
p
dev.off()
## png 
##   2
filtered <- experiment.aggregate.sce[, experiment.aggregate.sce$use]
p <- plotColData(filtered, x = "sum", y = "subsets_Mito_percent", 
    other_fields = "sample.id") + facet_wrap(~sample.id)
p

pdf(paste0(outdir, "/10_plotColData_subsets_Mito_percent_vs_sum_post_filtering.pdf"), 
    width = 15, height = 9)
p
dev.off()
## png 
##   2
dim(filtered)
## [1] 32285 18055
table(filtered$sample.id)
## 
##    C0   C24 
## 11131  6924
summary(filtered$detected)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   206.0   836.5  1538.0  1792.9  2576.0  7841.0
summary(filtered$subsets_Mito_percent)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.3163 11.3079 25.7262 25.6500 38.2506 54.9932
per.feat <- perFeatureQCMetrics(experiment.aggregate.sce)
summary(per.feat$mean)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##   0.0000   0.0000   0.0009   0.2366   0.0480 579.3806
summary(per.feat$detected)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##   0.00000   0.00000   0.07958   4.63628   4.00668 100.00000
ave <- calculateAverage(experiment.aggregate.sce)
summary(ave)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##   0.0000   0.0000   0.0011   0.2366   0.0525 573.4458
summary(nexprs(experiment.aggregate.sce, byrow = TRUE))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0      20    1165    1007   25133
plotHighestExprs(experiment.aggregate.sce, exprs_values = "counts")

pdf(paste0(outdir, "/11_plotHighestExprs.pdf"), width = 7, height = 7)
plotHighestExprs(experiment.aggregate.sce, exprs_values = "counts")
dev.off()
## png 
##   2
keep_feature <- nexprs(experiment.aggregate.sce, byrow = TRUE) > 
    0
filtered <- experiment.aggregate.sce[keep_feature, experiment.aggregate.sce$use]
dim(filtered)
## [1] 22399 18055

Explanatory factors

We can investigate the relative importance of different explanatory factors with the plotExplanatoryVariables function. This function computes the percentage of variance in gene expression that is explained by variables in the sample-level metadata. It allows problematic factors to be quickly identified, as well as the genes that are most affected. This is best done on the log-expression values to reduce the effect of the mean on the variance - hence, we run normalize first. This can take a while to calculate and plot.

experiment.aggregate.sce <- logNormCounts(experiment.aggregate.sce)
vars <- getVarianceExplained(experiment.aggregate.sce, variables = c("sample.id", 
    "sum", "detected", "subsets_Mito_percent"))
head(vars)
##           sample.id          sum    detected subsets_Mito_percent
## Xkr4    0.006943212 0.0003875252 0.010707344          0.010367705
## Gm1992          NaN          NaN         NaN                  NaN
## Gm19938 0.006475890 0.0036193873 0.009556241          0.004387266
## Gm37381 0.008258246 0.0574859723 0.084895733          0.001749579
## Rp1     0.114455198 0.0838337212 0.198380080          0.044357580
## Sox17   0.046244971 0.3358676296 0.105142397          2.814385055

Plot explanatory variables ordered by percentage of variance explained

plotExplanatoryVariables(vars)

filtered <- logNormCounts(filtered)
vars <- getVarianceExplained(filtered, variables = c("sample.id", 
    "sum", "detected", "subsets_Mito_percent"))
head(vars)
##           sample.id          sum     detected subsets_Mito_percent
## Xkr4    0.007104581 0.0002976112 5.734868e-03         0.0024295009
## Gm19938 0.006728322 0.0036912228 5.271952e-03         0.0002947912
## Gm37381 0.009610374 0.0630397123 9.446692e-02         0.0010401689
## Rp1     0.114915087 0.0836476322 1.610593e-01         0.0005540464
## Sox17   0.167635588 0.4421780293 1.115945e-02         3.5530867920
## Gm37587 0.007028329 0.0136398415 3.062177e-05         0.0583494803
plotExplanatoryVariables(vars)

Using manual filters to filter Seurat object

filtered <- experiment.aggregate[keep_feature, experiment.aggregate.sce$use]
dim(filtered)
## [1] 22399 18055

QC metrics with Seurat post-filtering

v <- VlnPlot(filtered, features = c("nFeature_RNA", "nCount_RNA", 
    "percent.mito"), ncol = 3, cols = colors, group.by = "sample.id")
v

pdf(paste0(outdir, "/12_VlnPlots_nFeature_RNA_nCount_RNA_percent.mito_per_sample_post_filtering.pdf"), 
    width = 21, height = 9)
v
dev.off()
## png 
##   2
f <- FeatureScatter(filtered, feature1 = "nCount_RNA", feature2 = "percent.mito", 
    cols = colors, group.by = "sample.id")
f

pdf(paste0(outdir, "/13_FeatureScatter_percent.mito_vs_nCount_RNA_per_sample_post_filtering.pdf"))
f
dev.off()
## png 
##   2
f <- FeatureScatter(filtered, feature1 = "nCount_RNA", feature2 = "nFeature_RNA", 
    cols = colors, group.by = "sample.id")
f

pdf(paste0(outdir, "/14_FeatureScatter_nFeature_RNA_vs_nCount_RNA_per_sample_post_filtering.pdf"))
f
dev.off()
## png 
##   2
saveRDS(filtered, paste0(outdir, "/15.filtered.398.rds"))

Session Information

sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-conda-linux-gnu (64-bit)
## Running under: Red Hat Enterprise Linux Server release 6.8 (Santiago)
## 
## Matrix products: default
## BLAS:   /gpfs/fs1/data/omicscore/Privratsky-Privratsky-20210215/scripts/conda/envs/privratsky/lib/libblas.so.3.8.0
## LAPACK: /gpfs/fs1/data/omicscore/Privratsky-Privratsky-20210215/scripts/conda/envs/privratsky/lib/liblapack.so.3.8.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] scater_1.18.0               ggplot2_3.3.3              
##  [3] SingleCellExperiment_1.12.0 SummarizedExperiment_1.20.0
##  [5] Biobase_2.50.0              GenomicRanges_1.42.0       
##  [7] GenomeInfoDb_1.26.0         IRanges_2.24.0             
##  [9] S4Vectors_0.28.0            BiocGenerics_0.36.0        
## [11] MatrixGenerics_1.2.0        matrixStats_0.58.0         
## [13] SeuratObject_4.0.0          Seurat_4.0.0               
## 
## loaded via a namespace (and not attached):
##   [1] ggbeeswarm_0.6.0          Rtsne_0.15               
##   [3] colorspace_2.0-0          deldir_0.2-9             
##   [5] ellipsis_0.3.1            ggridges_0.5.3           
##   [7] scuttle_1.0.0             XVector_0.30.0           
##   [9] BiocNeighbors_1.8.0       spatstat.data_2.0-0      
##  [11] farver_2.0.3              leiden_0.3.7             
##  [13] listenv_0.8.0             ggrepel_0.9.1            
##  [15] sparseMatrixStats_1.2.0   codetools_0.2-18         
##  [17] splines_4.0.3             knitr_1.31               
##  [19] polyclip_1.10-0           jsonlite_1.7.2           
##  [21] ica_1.0-2                 cluster_2.1.1            
##  [23] png_0.1-7                 uwot_0.1.10              
##  [25] shiny_1.6.0               sctransform_0.3.2        
##  [27] compiler_4.0.3            httr_1.4.2               
##  [29] Matrix_1.3-2              fastmap_1.1.0            
##  [31] lazyeval_0.2.2            formatR_1.7              
##  [33] BiocSingular_1.6.0        later_1.1.0.1            
##  [35] htmltools_0.5.1.1         tools_4.0.3              
##  [37] rsvd_1.0.3                igraph_1.2.6             
##  [39] gtable_0.3.0              glue_1.4.2               
##  [41] GenomeInfoDbData_1.2.4    RANN_2.6.1               
##  [43] reshape2_1.4.4            dplyr_1.0.4              
##  [45] Rcpp_1.0.6                spatstat_1.64-1          
##  [47] scattermore_0.7           vctrs_0.3.6              
##  [49] nlme_3.1-152              DelayedMatrixStats_1.12.0
##  [51] lmtest_0.9-38             xfun_0.20                
##  [53] stringr_1.4.0             globals_0.14.0           
##  [55] beachmat_2.6.0            mime_0.10                
##  [57] miniUI_0.1.1.1            lifecycle_1.0.0          
##  [59] irlba_2.3.3               goftest_1.2-2            
##  [61] future_1.21.0             MASS_7.3-53.1            
##  [63] zlibbioc_1.36.0           zoo_1.8-8                
##  [65] scales_1.1.1              promises_1.2.0.1         
##  [67] spatstat.utils_2.0-0      RColorBrewer_1.1-2       
##  [69] yaml_2.2.1                reticulate_1.18          
##  [71] pbapply_1.4-3             gridExtra_2.3            
##  [73] rpart_4.1-15              stringi_1.5.3            
##  [75] highr_0.8                 BiocParallel_1.24.0      
##  [77] rlang_0.4.10              pkgconfig_2.0.3          
##  [79] bitops_1.0-6              evaluate_0.14            
##  [81] lattice_0.20-41           ROCR_1.0-11              
##  [83] purrr_0.3.4               tensor_1.5               
##  [85] labeling_0.4.2            patchwork_1.1.1          
##  [87] htmlwidgets_1.5.3         cowplot_1.1.1            
##  [89] tidyselect_1.1.0          parallelly_1.23.0        
##  [91] RcppAnnoy_0.0.18          plyr_1.8.6               
##  [93] magrittr_2.0.1            R6_2.5.0                 
##  [95] generics_0.1.0            DelayedArray_0.16.0      
##  [97] withr_2.4.1               pillar_1.4.7             
##  [99] mgcv_1.8-33               fitdistrplus_1.1-3       
## [101] survival_3.2-7            abind_1.4-5              
## [103] RCurl_1.98-1.2            tibble_3.0.6             
## [105] future.apply_1.7.0        crayon_1.4.1             
## [107] KernSmooth_2.23-18        plotly_4.9.3             
## [109] rmarkdown_2.6             viridis_0.5.1            
## [111] grid_4.0.3                data.table_1.13.6        
## [113] digest_0.6.27             xtable_1.8-4             
## [115] tidyr_1.1.2               httpuv_1.5.5             
## [117] munsell_0.5.0             beeswarm_0.2.3           
## [119] viridisLite_0.3.0         vipor_0.4.5
writeLines(capture.output(sessionInfo()), "./scripts/1_JP_scripts/1_QC_filtering_metrics_HARDAC.sessionInfo.txt")